Github url: https://github.com/nwh84/angsd-project All bash scripts are in github repo
$ sbatch align_all.sh
sbatch run_fastQC.sh $SCRATCH_DIR/fastqc_out
Output .html file in github This shows the summary of the samtools output and the multiqc output. We can see that SRR8368044_BP1 has far more reads mapped than the other samples. However, the total sequences count for the paired end reads looks similar to the other sequences. The high number of mapped reads may be due to a high level of PCR duplication or because the sample has many repetitive regions. When we look at fastQC results, we don’t see high levels of duplicate reads for this sample, so PCR duplication is probably not the culprit. We can see in the flagstat results that this sample has a large number of total reads, mapped reads and secondary alignment. Therefore, maybe the high number of mapped reads is due to repetitive regions that lead to multiple mapping and more unique reads/higher gene expression. We also see a spike in GC content for many on the experimental groups which might be due to contamination or a biological effect.
$ rename SRR8367773 SRR8367773_control SRR8367773*
$ rename SRR8367783 SRR8367783_control SRR8367783*
$ rename SRR8367785 SRR8367785_control SRR8367785*
$ rename SRR8367786 SRR8367786_control SRR8367786*
$ rename SRR8367787 SRR8367787_control SRR8367787*
$ rename SRR8367789 SRR8367789_control SRR8367789*
$ rename SRR8368044 SRR8368044_BP1 SRR8368044*
$ rename SRR8368048 SRR8368048_BP1 SRR8368048*
$ rename SRR8368055 SRR8368055_BP1 SRR8368055*
$ rename SRR8368061 SRR8368061_BP1 SRR8368061*
$ rename SRR8368072 SRR8368072_BP1 SRR8368072*
$ rename SRR8368152 SRR8368152_BP1 SRR8368152*
$ cd $SCRATCH_DIR
$ featureCounts -a hg38.ensGene.gtf -o gene_counts/hg38_fc alignments/SRR8367773_control.Aligned.sortedByCoord.out.bam alignments/SRR8367783_control.Aligned.sortedByCoord.out.bam alignments/SRR8367785_control.Aligned.sortedByCoord.out.bam
alignments/SRR8367786_control.Aligned.sortedByCoord.out.bam
alignments/SRR8367787_control.Aligned.sortedByCoord.out.bam
alignments/SRR8367789_control.Aligned.sortedByCoord.out.bam
alignments/SRR8368044_BP1.Aligned.sortedByCoord.out.bam
alignments/SRR8368048_BP1.Aligned.sortedByCoord.out.bam
alignments/SRR8368055_BP1.Aligned.sortedByCoord.out.bam
alignments/SRR8368061_BP1.Aligned.sortedByCoord.out.bam
alignments/SRR8368072_BP1.Aligned.sortedByCoord.out.bam
alignments/SRR8368152_BP1.Aligned.sortedByCoord.out.bam -s 2 -p --countReadPairs -C
parameters: -s 1, perform stranded read counting -p, Specify that input data contain paired-end reads –countReadPairs, Count read pairs (fragments) instead of reads. -C, Do not count read pairs that have their two ends mapping to different chromosomes or mapping to same chromosome but on different strands.
read_counts_summary <- read.table('/Users/noelawheeler/Desktop/Analysis of next gen seq data/angsd-project/hg38_fc.summary')
read_counts <- read.table('/Users/noelawheeler/Desktop/Analysis of next gen seq data/angsd-project/hg38_fc', header = TRUE)
run qorts for quality control
$ mamba activate qorts
$ sbatch run_qorts.sh $SCRATCH_DIR/alignments
install.packages("http://hartleys.github.io/QoRTs/QoRTs_LATEST.tar.gz",
repos = NULL,
type="source")
library(QoRTs)
# make decoder file
incompleteDecoder <- data.frame(unique.ID = c("SRR8367773_control","SRR8367783_control","SRR8367785_control", "SRR8367786_control", "SRR8367787_control", "SRR8367789_control", "SRR8368044_BP1", "SRR8368048_BP1", "SRR8368055_BP1", "SRR8368061_BP1", "SRR8368072_BP1", "SRR8368152_BP1"),
group.ID = c("CONTROL","CONTROL","CONTROL","CONTROL","CONTROL","CONTROL","CASE","CASE","CASE","CASE","CASE","CASE"), qc.data.dir = c("qort_out/SRR8367773_control","qort_out/SRR8367783_control", "qort_out/SRR8367785_control", "qort_out/SRR8367786_control", "qort_out/SRR8367787_control", "qort_out/SRR8367789_control", "qort_out/SRR8368044_BP1", "qort_out/SRR8368048_BP1", "qort_out/SRR8368055_BP1", "qort_out/SRR8368061_BP1", "qort_out/SRR8368072_BP1", "qort_out/SRR8368052_BP1"));
decoder <- completeAndCheckDecoder(incompleteDecoder)
## column 'qc.data.prefix' not found in the decoder, assuming qc.data.prefix = ""
# read in qort results
#res <- read.qc.results.data("/Users/noelawheeler/Desktop/Analysis of next gen seq data/angsd-project", decoder = decoder, calc.DESeq2 = TRUE, calc.edgeR = TRUE)
# generate multi-plot figures (do once)
# makeMultiPlot.all(res,
# outfile.dir = "/Users/noelawheeler/Desktop/Analysis of next gen seq data/angsd-project/qort_out/summaryPlots/",
# plot.device.name = "png")
The Phred quality score looks good for all samples as does gene body
coverage. Some issues are that one experimental sample has very high MT
content. This might explain why we get really high read counts in that
sample for SRR8368044_BP1. We also see that most of the experimental
groups have a spike in GC content. This seems strange and might mean
some kind of contamination or maybe some biological effect.
run deSeq2 for processing
library(ggplot2); theme_set(theme_bw(base_size = 16))
library(magrittr)
library(DESeq2)
# fix column names
orig_names <- names(read_counts)
sample_names <- gsub("^alignments\\.|\\.Aligned.*$", "", orig_names)
names(read_counts) <- sample_names
row.names(read_counts) <- make.names(read_counts$Geneid)
# get rid of unnecessary columns
readcounts <- read_counts[ , -c(1:6)]
# get sample infor
sample_info <- data.frame(condition = gsub("SRR[0-9]+_", "", names(readcounts)), row.names = names(readcounts))
sample_info
## condition
## SRR8367773_control control
## SRR8367783_control control
## SRR8367785_control control
## SRR8367786_control control
## SRR8367787_control control
## SRR8367789_control control
## SRR8368044_BP1 BP1
## SRR8368048_BP1 BP1
## SRR8368055_BP1 BP1
## SRR8368061_BP1 BP1
## SRR8368072_BP1 BP1
## SRR8368152_BP1 BP1
# make Deseq object
DESeq.ds <- DESeqDataSetFromMatrix(countData = as.matrix(readcounts),
colData = sample_info,
design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
DESeq.ds
## class: DESeqDataSet
## dim: 64252 12
## metadata(1): version
## assays(1): counts
## rownames(64252): ENSG00000223972 ENSG00000227232 ... ENSG00000231514
## ENSG00000235857
## rowData names(0):
## colnames(12): SRR8367773_control SRR8367783_control ... SRR8368072_BP1
## SRR8368152_BP1
## colData names(1): condition
# option to drop sample SRR8368044_BP1 because of mitochondrial contamination
readcounts %<>% dplyr::select(-SRR8368044_BP1)
# create new sample_info object
sample_info <- data.frame(condition = gsub("SRR[0-9]+_", "", names(readcounts)), row.names = names(readcounts))
# new deseq object
DESeq.ds <- DESeqDataSetFromMatrix(countData = as.matrix(readcounts),
colData = sample_info,
design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
DESeq.ds
## class: DESeqDataSet
## dim: 64252 11
## metadata(1): version
## assays(1): counts
## rownames(64252): ENSG00000223972 ENSG00000227232 ... ENSG00000231514
## ENSG00000235857
## rowData names(0):
## colnames(11): SRR8367773_control SRR8367783_control ... SRR8368072_BP1
## SRR8368152_BP1
## colData names(1): condition
val <- colSums(counts(DESeq.ds))
barplot(val, las = 2)
# remove genes with no reads
keep_genes <- rowSums(counts(DESeq.ds)) > 0
DESeq.ds <- DESeq.ds[ keep_genes, ]
Normalize for sequence depth
# calculate size factors
DESeq.ds <- estimateSizeFactors(DESeq.ds)
## plot size factors
plot( sizeFactors(DESeq.ds), colSums(counts(DESeq.ds)), # assess them
ylab = "library sizes", xlab = "size factors", cex = .6 )
counts.sf_normalized <- counts(DESeq.ds, normalized=TRUE)
boxplot(counts(DESeq.ds), main = "read counts only", cex = .6, las = 2)
boxplot(counts.sf_normalized, main = "SF normalized", cex = .6, las =2)
We can see that a lot of the samples have pretty extreme outliers. Ater normalization these go down somewhat, although we still see very high counts for the one SRR8368061_BP1 largest outlier gene.
Look at log scaled axis
par(mfrow=c(1,2)) # to plot the two box plots next to each other
## bp of non-normalized
boxplot(log2(counts(DESeq.ds) +1), notch=TRUE,
main = "Non-normalized read counts",
ylab ="log2(read counts)", cex = .6, las = 2)
## Warning in (function (z, notch = FALSE, width = NULL, varwidth = FALSE, : some
## notches went outside hinges ('box'): maybe set notch=FALSE
## bp of size-factor normalized values
boxplot(log2(counts(DESeq.ds, normalized=TRUE) +1), notch=TRUE,
main = "Size-factor-normalized read counts",
ylab ="log2(read counts)", cex = .6, las = 2,
mar = c(10, 4, 4, 2), cex.axis = 0.8)
## Warning in (function (z, notch = FALSE, width = NULL, varwidth = FALSE, : some
## notches went outside hinges ('box'): maybe set notch=FALSE
When we view these on a log scale the samples look more similarly distributed than before.
# assign the log counts and log norm counts to a distinct matrix within the DESeq.ds object
assay(DESeq.ds, "log.counts") <- log2(counts(DESeq.ds, normalized = FALSE) + 1)
assay(DESeq.ds, "log.norm.counts") <- log2(counts(DESeq.ds, normalized=TRUE) + 1)
# reduce the dependence of the variance
DESeq.rlog <- rlog(DESeq.ds, blind = TRUE)
# plot to view difference in variance
plot(assay(DESeq.ds, "log.norm.counts")[,1:2], cex=.1, main = "size factor and log2-transformed")
plot(assay(DESeq.rlog)[,1:2],
cex=.1, main = "rlog transformed",
xlab = colnames(assay(DESeq.rlog[,1])),
ylab = colnames(assay(DESeq.rlog[,2])) )
Sample clustering
library(pheatmap)
sampleDists <- dist(t(assay(DESeq.rlog)))
sampleDistMatrix <- as.matrix(sampleDists)
pheatmap(sampleDistMatrix)
The heatmaps shows fairly predictable behavior except for the one control sample that clusters more closely with the BP1 samples than the control. We can also see that most of the BP1 samples are more closely correlated with each other than the control samples are.
plotPCA(DESeq.rlog)
If we look at the PC1 with 60% of the variance, we can see that most of the BP1 and control samples cluster together except one control sample that clusters close to the BP1 samples. With regards to PC2, we do not really see clustering between samples.
Perform differential expression analysis
# put control samples as baseline
DESeq.ds$condition %<>% relevel(ref="control")
# check that contrast is set up correctly
if (design(DESeq.ds) != ~condition){
print("We want condition to be fixed effect in DESeq.ds")
}
# final steps in testing differential expression
DESeq.ds %<>% estimateDispersions()
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
DESeq.ds %<>% nbinomWaldTest()
DESeq.ds
## class: DESeqDataSet
## dim: 52197 11
## metadata(1): version
## assays(6): counts log.counts ... H cooks
## rownames(52197): ENSG00000227232 ENSG00000278267 ... ENSG00000215506
## ENSG00000237917
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(11): SRR8367773_control SRR8367783_control ... SRR8368072_BP1
## SRR8368152_BP1
## colData names(2): condition sizeFactor
Raw distribution of P-values
rowData(DESeq.ds)@listData[["WaldPvalue_condition_BP1_vs_control"]] %>% hist(main="Raw p-values for BP1 vs control")
DGE.results <- results(DESeq.ds, independentFiltering = TRUE, alpha = 0.05)
head(DGE.results)
## log2 fold change (MLE): condition BP1 vs control
## Wald test p-value: condition BP1 vs control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000227232 3.267594 -0.885392 0.772908 -1.145533 0.251988
## ENSG00000278267 1.288781 -1.236665 1.351152 -0.915267 0.360051
## ENSG00000240361 0.324008 -1.471616 3.146236 -0.467739 0.639972
## ENSG00000186092 0.170250 -0.948072 3.149755 -0.300999 0.763415
## ENSG00000238009 2.538705 -1.748866 1.107147 -1.579615 0.114195
## ENSG00000233750 0.324008 -1.471616 3.146236 -0.467739 0.639972
## padj
## <numeric>
## ENSG00000227232 0.349331
## ENSG00000278267 0.464943
## ENSG00000240361 NA
## ENSG00000186092 NA
## ENSG00000238009 0.181595
## ENSG00000233750 NA
summary(DGE.results)
##
## out of 52197 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 2143, 4.1%
## LFC < 0 (down) : 11984, 23%
## outliers [1] : 21, 0.04%
## low counts [2] : 19222, 37%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
DGE.results$padj %>% hist(breaks=19, main="Adjusted p-values for control vs BP1")
We can see that less genes that are now significant which is the
expected result.
Find important genes
DGE.results.sorted <- DGE.results %>% `[`(order(.$padj),)
head(DGE.results.sorted)
## log2 fold change (MLE): condition BP1 vs control
## Wald test p-value: condition BP1 vs control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000198695 534.9659 -7.34126 0.689151 -10.65262 1.69524e-26
## ENSG00000212443 292.5063 3.84884 0.390444 9.85760 6.35493e-23
## ENSG00000279978 603.0730 -2.73890 0.305699 -8.95947 3.26249e-19
## ENSG00000210107 82.2444 -7.45760 0.840365 -8.87423 7.04164e-19
## ENSG00000198786 359.8168 -3.67935 0.429672 -8.56316 1.09811e-17
## ENSG00000198216 101.2343 -2.29934 0.296174 -7.76346 8.26448e-15
## padj
## <numeric>
## ENSG00000198695 5.58649e-22
## ENSG00000212443 1.04710e-18
## ENSG00000279978 3.58374e-15
## ENSG00000210107 5.80126e-15
## ENSG00000198786 7.23744e-14
## ENSG00000198216 4.53913e-11
par(mfrow=c(1,2))
plotCounts(DESeq.ds, gene="ENSG00000198695", normalized = TRUE, xlab="")
plotCounts(DESeq.ds, gene = which.max(DGE.results$padj), xlab="",
main = "Gene with max. p.adj.\n(=least significant)")
heatmap of differentially expressed genes
library(pheatmap)
# identify genes with the desired adjusted p-value cut-off
DGEgenes <- rownames(subset(DGE.results.sorted, padj < 0.05))
rlog.dge <- DESeq.rlog[DGEgenes,] %>% assay
pheatmap(rlog.dge, scale="row",
show_rownames=FALSE, main="DGE (row-based z-score)")
# BiocManager::install("EnhancedVolcano")
library(EnhancedVolcano)
## Loading required package: ggrepel
vp1 <- EnhancedVolcano(DGE.results,
lab=rownames(DGE.results),
x='log2FoldChange', y='padj',
xlim = c(-10,10),
pCutoff=0.05,
title="SNF2 / WT")
#print(vp1)
Shrink logFC values of low/noisily expressed genes
# install.packages("patchwork")
# BiocManager::install("apeglm")
library(patchwork)
DGE.results.shrink <- lfcShrink(DESeq.ds, coef=2, type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
vp2 <- EnhancedVolcano(DGE.results.shrink, lab = rownames(DGE.results.shrink), x = "log2FoldChange", y='padj', title="with logFC shrinkage")
vp1 + vp2
#BiocManager::install("org.Hs.eg.db")
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Warning: replacing previous import 'utils::findMatches' by
## 'S4Vectors::findMatches' when loading 'AnnotationDbi'
##
human <- org.Hs.eg.db
# gene_symbols <- mapIds(org.Hs.eg.db, keys=ensembl_ids, column="SYMBOL", keytype="ENSEMBL")
annot.DGE <- select(human, keys=rownames(DGE.results.shrink),
keytype="ENSEMBL", columns="SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
head(annot.DGE)
## ENSEMBL SYMBOL
## 1 ENSG00000227232 <NA>
## 2 ENSG00000278267 MIR6859-1
## 3 ENSG00000240361 OR4G11P
## 4 ENSG00000186092 OR4F5
## 5 ENSG00000238009 <NA>
## 6 ENSG00000233750 CICP27
Find names of most differentially expressed genes
# add ENSEMBL as column instead of rowname
DGE.results.sorted$ENSEMBL <- rownames(DGE.results.sorted)
rownames(DGE.results.sorted) <- NULL
# merge annotation and DGE results on common column ENSEMBL
DGE.results.annot <- merge(as.data.frame(DGE.results.sorted), annot.DGE, by = "ENSEMBL", all = TRUE)
# sort results by p-values
DGE.annot.sorted <- DGE.results.annot %>% `[`(order(.$padj),)
head(DGE.annot.sorted)
## ENSEMBL baseMean log2FoldChange lfcSE stat
## 17421 ENSG00000198695 534.96588 -7.341260 0.6891506 -10.652621
## 20718 ENSG00000212443 292.50629 3.848837 0.3904436 9.857601
## 52176 ENSG00000279978 603.07295 -2.738898 0.3056988 -8.959468
## 20289 ENSG00000210107 82.24442 -7.457597 0.8403652 -8.874234
## 17463 ENSG00000198786 359.81679 -3.679350 0.4296718 -8.563164
## 17283 ENSG00000198216 101.23429 -2.299338 0.2961745 -7.763458
## pvalue padj SYMBOL
## 17421 1.695239e-26 5.586489e-22 ND6
## 20718 6.354927e-23 1.047101e-18 SNORA53
## 52176 3.262493e-19 3.583740e-15 <NA>
## 20289 7.041640e-19 5.801255e-15 <NA>
## 17463 1.098112e-17 7.237437e-14 ND5
## 17283 8.264480e-15 4.539128e-11 CACNA1E
The most important differentially expressed genes are NADH dehydrogenase subunit 6, small nucleolar RNA, H/ACA box 53, NADH dehydrogenase subunit 5, and calcium voltage-gated channel subunit alpha1 E. A lot of these are mitochondrial genes.
Find genes that are highly expressed in the SRR8368152_BP1 sample because it has so many more counts than the other samples.
counts_bp1 <- counts(DESeq.ds)[, "SRR8368152_BP1"]
df_bp1 <- as.data.frame(counts_bp1)
df_bp1$ENSEMBL <- rownames(df_bp1)
rownames(df_bp1) <- NULL
# merge annotation and DGE results on common column ENSEMBL
annot.onesample <- merge(df_bp1, annot.DGE, by = "ENSEMBL", all = TRUE)
onesample.counts <- annot.onesample[order(-annot.onesample[,2]),]
head(onesample.counts)
## ENSEMBL counts_bp1 SYMBOL
## 18637 ENSG00000202198 398745 RN7SK
## 37241 ENSG00000251562 216925 MALAT1
## 1862 ENSG00000090382 48120 LYZ
## 45154 ENSG00000265972 46732 TXNIP
## 10690 ENSG00000162747 35062 FCGR3B
## 10691 ENSG00000162747 35062 LOC124905743
# run gsea
# get just gene symbol and log fold change
#BiocManager::install("fgsea")
library(fgsea)
# take only symbol and log fold change from deg result
# take average of duplicate genes
res <- DGE.annot.sorted %>% dplyr::select(SYMBOL, log2FoldChange) %>%
na.omit() %>% dplyr::group_by(SYMBOL) %>% dplyr::summarize(log2FoldChange=mean(log2FoldChange))
# turn list into dataframe
ranks <- tibble::deframe(res)
# head(ranks, 20)
# import pathways file to use
pathways.hallmark <- gmtPathways("/Users/noelawheeler/Desktop/Analysis of next gen seq data/angsd-project/h.all.v2023.1.Hs.symbols.gmt")
# check pathways look fine
pathways.hallmark %>% head() %>% lapply(head)
## $HALLMARK_TNFA_SIGNALING_VIA_NFKB
## [1] "JUNB" "CXCL2" "ATF3" "NFKBIA" "TNFAIP3" "PTGS2"
##
## $HALLMARK_HYPOXIA
## [1] "PGK1" "PDK1" "GBE1" "PFKL" "ALDOA" "ENO2"
##
## $HALLMARK_CHOLESTEROL_HOMEOSTASIS
## [1] "FDPS" "CYP51A1" "IDI1" "FDFT1" "DHCR7" "SQLE"
##
## $HALLMARK_MITOTIC_SPINDLE
## [1] "ARHGEF2" "CLASP1" "KIF11" "KIF23" "ALS2" "ARF6"
##
## $HALLMARK_WNT_BETA_CATENIN_SIGNALING
## [1] "MYC" "CTNNB1" "JAG2" "NOTCH1" "DLL1" "AXIN2"
##
## $HALLMARK_TGF_BETA_SIGNALING
## [1] "TGFBR1" "SMAD7" "TGFB1" "SMURF2" "SMURF1" "BMPR2"
# run gsea
fgsea.res <- fgseaMultilevel(pathways=pathways.hallmark, stats=ranks)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (20.31% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways.hallmark, stats = ranks): There
## were 13 pathways for which P-values were not calculated properly due to
## unbalanced (positive and negative) gene-level statistic values. For such
## pathways pval, padj, NES, log2err are set to NA. You can try to increase the
## value of the argument nPermSimple (for example set it nPermSimple = 10000)
# select certain columns and rank by p value
fgsea.res.order <- fgsea.res %>% dplyr::select(-log2err, -ES, -leadingEdge, ) %>% dplyr::arrange(padj)
head(fgsea.res.order)
## pathway pval padj
## 1: HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 1.518659e-08 5.619038e-07
## 2: HALLMARK_KRAS_SIGNALING_DN 9.385621e-08 1.736340e-06
## 3: HALLMARK_PANCREAS_BETA_CELLS 3.788518e-05 4.594613e-04
## 4: HALLMARK_XENOBIOTIC_METABOLISM 4.967149e-05 4.594613e-04
## 5: HALLMARK_APICAL_JUNCTION 2.483546e-04 1.837824e-03
## 6: HALLMARK_SPERMATOGENESIS 1.132109e-03 6.981338e-03
## NES size
## 1: -1.362111 197
## 2: -1.330741 193
## 3: -1.496329 37
## 4: -1.258768 197
## 5: -1.219026 194
## 6: -1.237142 132
# view results of gsea normalized enrichment score
library(ggplot2)
# only look at pathways with NES > 0 and NES != na
visualize.fgsea <- fgsea.res.order %>% na.omit(.)
# plot
head(visualize.fgsea)
## pathway pval padj
## 1: HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 1.518659e-08 5.619038e-07
## 2: HALLMARK_KRAS_SIGNALING_DN 9.385621e-08 1.736340e-06
## 3: HALLMARK_PANCREAS_BETA_CELLS 3.788518e-05 4.594613e-04
## 4: HALLMARK_XENOBIOTIC_METABOLISM 4.967149e-05 4.594613e-04
## 5: HALLMARK_APICAL_JUNCTION 2.483546e-04 1.837824e-03
## 6: HALLMARK_SPERMATOGENESIS 1.132109e-03 6.981338e-03
## NES size
## 1: -1.362111 197
## 2: -1.330741 193
## 3: -1.496329 37
## 4: -1.258768 197
## 5: -1.219026 194
## 6: -1.237142 132
ggplot(visualize.fgsea, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.05)) + coord_flip() + labs(x="Pathway", y="Normalized Enrichment Score", title="Hallmark pathways from GSEA") + theme(text = element_text(size = 6))
# try kegg pathway
pathways.kegg <- gmtPathways("/Users/noelawheeler/Desktop/Analysis of next gen seq data/angsd-project/c2.cp.kegg.v2023.1.Hs.symbols.gmt")
# check pathways look fine
pathways.kegg %>% head() %>% lapply(head)
## $KEGG_N_GLYCAN_BIOSYNTHESIS
## [1] "ALG13" "DOLPP1" "RPN1" "ALG14" "MAN1B1" "ALG3"
##
## $KEGG_OTHER_GLYCAN_DEGRADATION
## [1] "ENGASE" "GLB1" "MANBA" "MAN2B1" "GBA1" "NEU4"
##
## $KEGG_O_GLYCAN_BIOSYNTHESIS
## [1] "GALNT4" "GALNT15" "GALNTL5" "GALNT6" "GALNT5" "GALNT16"
##
## $KEGG_GLYCOSAMINOGLYCAN_DEGRADATION
## [1] "HS3ST3A1" "HPSE" "HPSE2" "GLB1" "GUSB" "HYAL3"
##
## $KEGG_GLYCOSAMINOGLYCAN_BIOSYNTHESIS_KERATAN_SULFATE
## [1] "CHST2" "B4GAT1" "B3GNT2" "CHST1" "B4GALT1" "B3GNT7"
##
## $KEGG_GLYCEROLIPID_METABOLISM
## [1] "MBOAT2" "GPAM" "LIPG" "DGKZ" "DGKE" "DGKD"
# run gsea
fgsea.kegg <- fgseaMultilevel(pathways=pathways.kegg, stats=ranks)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (20.31% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways.kegg, stats = ranks): There were
## 19 pathways for which P-values were not calculated properly due to unbalanced
## (positive and negative) gene-level statistic values. For such pathways pval,
## padj, NES, log2err are set to NA. You can try to increase the value of the
## argument nPermSimple (for example set it nPermSimple = 10000)
## Warning in fgseaMultilevel(pathways = pathways.kegg, stats = ranks): For some
## of the pathways the P-values were likely overestimated. For such pathways
## log2err is set to NA.
# select certain columns and rank by p value
fgsea.res.order <- fgsea.kegg %>% dplyr::select(-log2err, -ES, -leadingEdge, ) %>% dplyr::arrange(padj)
head(fgsea.res.order)
## pathway pval padj
## 1: KEGG_NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION 9.138009e-19 1.526048e-16
## 2: KEGG_CALCIUM_SIGNALING_PATHWAY 1.435912e-11 1.198987e-09
## 3: KEGG_OLFACTORY_TRANSDUCTION 3.445827e-10 1.918177e-08
## 4: KEGG_TIGHT_JUNCTION 8.201777e-09 3.424242e-07
## 5: KEGG_LONG_TERM_POTENTIATION 1.148708e-07 3.836685e-06
## 6: KEGG_DRUG_METABOLISM_CYTOCHROME_P450 5.747162e-07 1.599627e-05
## NES size
## 1: -1.493913 266
## 2: -1.446831 176
## 3: -1.328691 299
## 4: -1.434090 130
## 5: -1.544268 69
## 6: -1.497978 68
# remove na
visualize.kegg <- fgsea.res.order %>% na.omit(.)
# plot
ggplot(visualize.kegg, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.05)) + coord_flip() + labs(x="Pathway", y="Normalized Enrichment Score", title="Hallmark pathways from GSEA") + theme(text = element_text(size = 3))
visualize.kegg
## pathway pval padj
## 1: KEGG_NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION 9.138009e-19 1.526048e-16
## 2: KEGG_CALCIUM_SIGNALING_PATHWAY 1.435912e-11 1.198987e-09
## 3: KEGG_OLFACTORY_TRANSDUCTION 3.445827e-10 1.918177e-08
## 4: KEGG_TIGHT_JUNCTION 8.201777e-09 3.424242e-07
## 5: KEGG_LONG_TERM_POTENTIATION 1.148708e-07 3.836685e-06
## ---
## 163: KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY 1.000000e+00 1.000000e+00
## 164: KEGG_TYPE_I_DIABETES_MELLITUS 9.880120e-01 1.000000e+00
## 165: KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY 1.000000e+00 1.000000e+00
## 166: KEGG_VALINE_LEUCINE_AND_ISOLEUCINE_BIOSYNTHESIS 7.933884e-01 1.000000e+00
## 167: KEGG_VALINE_LEUCINE_AND_ISOLEUCINE_DEGRADATION 9.990010e-01 1.000000e+00
## NES size
## 1: -1.4939126 266
## 2: -1.4468309 176
## 3: -1.3286914 299
## 4: -1.4340901 130
## 5: -1.5442677 69
## ---
## 163: -0.6507879 98
## 164: -0.6553719 42
## 165: -0.5300955 104
## 166: -0.8274047 11
## 167: -0.5079931 44
plotEnrichment(pathways.kegg[["KEGG_NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION"]], ranks) + labs(title="Neuroactive ligand receptor pathway")